Newer
Older
Digital_Repository / Misc / Mass downloads / harvest.pl
#!/usr/bin/perl

use strict;
use warnings;
use HTTP::OAI;
use POSIX qw(strftime);

# Command-line arguments:
#   1. base URL of the OAI-PMH repository to harvest (required)
#   2. lower date bound, passed on as the OAI "from" argument (optional)
my $url = shift;
my $last_date = shift;
die "Usage: $0 <oai-base-url> [from-date]\n"
		unless defined $url && length $url;

# OAI-PMH expresses all dates in UTC, so the "until" bound is built from
# gmtime rather than localtime. The format is spelled out because %F is
# not supported by every platform's strftime.
my $today = strftime( "%Y-%m-%d", gmtime );

# Create the harvester and issue an Identify request; the Identify response
# is handed to $h->repository before any list request is made.
my $h = HTTP::OAI::Harvester->new(baseURL=>$url);
my $response = $h->repository($h->Identify);
if( $response->is_error ) {
		# Report on STDERR and exit non-zero so a calling shell or cron job
		# can tell that the harvest did not run.
		print STDERR "Error requesting Identify:\n",
				$response->code . " " . $response->message, "\n";
		exit 1;
}

# Note: repositoryVersion will always be 2.0, $r->version returns
# the actual version the repository is running
# print "Repository supports protocol version ", $response->version, "\n";

# Version 1.x repositories don't support metadataPrefix,
# but OAI-PERL will drop the prefix automatically
# if an Identify was requested first (as above)
#
# The start date is optional on the command line, so "from" is only
# included when a value was actually supplied; an undefined bound is
# never handed to the library.
$response = $h->ListIdentifiers(
		metadataPrefix=>'oai_dc',
		( defined $last_date ? ( from=>$last_date ) : () ),
		until=>$today,
);

# Abort before the full record harvest if the identifier request failed.
if( $response->is_error ) {
		die("Error harvesting: " . $response->message . "\n");
}

# print "responseDate => ", $response->responseDate, "\n",
# 		"requestURL => ", $response->requestURL, "\n";

# while( my $id = $response->next ) {
# 		printf "%s\t%s\n", $id->identifier, $id->datestamp;
# # 		print " (", $id->status, ")" if $id->status;
# # 		print "\n";
# # 		# Only available from OAI 2.0 repositories
# # 		for( $id->setSpec ) {
# # 				print "\t", $_, "\n";
# # 		}
# }

# Using a handler: HTTP::OAI::Metadata::OAI_DC parses each record's oai_dc
# metadata. For every record the identifier and datestamp are printed,
# followed by the metadata object and a comma-separated list of its titles.
# As with ListIdentifiers, "from" is only sent when a start date was given.
$response = $h->ListRecords(
		metadataPrefix=>'oai_dc',
		handlers=>{metadata=>'HTTP::OAI::Metadata::OAI_DC'},
		( defined $last_date ? ( from=>$last_date ) : () ),
		until=>$today,
);
while( my $rec = $response->next ) {
		my $metadata = $rec->metadata;

		print $rec->identifier, "\t",
				$rec->datestamp, "\n";

		# A record can come back without a metadata part (e.g. one the
		# repository reports as deleted). Calling ->dc on it would abort the
		# whole harvest, so note it on STDERR and move on.
		unless( defined $metadata ) {
				warn "No metadata for ", $rec->identifier, "; skipping\n";
				next;
		}

		print $metadata, "\n";
		# Fall back to an empty list so a record without titles cannot
		# trigger a strict-refs failure on the dereference.
		print join(',', @{ $metadata->dc->{'title'} || [] }), "\n";
}
# next() simply stops returning records when a request fails, so the
# response itself has to be checked once the loop is finished.
if( $response->is_error ) {
		die("Error harvesting records: " . $response->message . "\n");
}
# 
# # Offline parsing
# $I = HTTP::OAI::Identify->new();
# $I->parse_string($content);
# $I->parse_file($fh);